# -*- coding: utf-8 -*-
from nltk.tokenize import RegexpTokenizer
import gensim
from gensim import corpora, models
from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity, Similarity


# Input files: one pre-processed document (or query) per line.
# Raw strings keep the Windows backslashes literal instead of relying on
# Python leaving unrecognised escapes such as "\I" or "\q" untouched.
CORPUS_PATH = r'D:\Implementations\Experiments\Eclipse3.0\Source\Eclipse3.0-AfterSplitStopStem.corpusRawMethodLevelGranularity'
QUERIES_PATH = r'D:\Implementations\Experiments\Eclipse3.0\Source\queries-AfterSplitStopStem.txt'

# Split each line into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

# Tokenize every line of the corpus file; each line is treated as one document.
# The context manager guarantees the file handle is closed once it is read.
with open(CORPUS_PATH) as corpus_file:
    texts = [tokenizer.tokenize(line) for line in corpus_file]

# Turn the tokenized documents into an id <-> term dictionary.
dictionary = corpora.Dictionary(texts)

# Convert the tokenized documents into bag-of-words vectors.
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LSI model. The original author's note cites Deerwester et al., 1990
# for the choice of 200 topics.
lsimodel = gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary, num_topics=200)

# Index the whole corpus in LSI space so that queries can be compared against it.
index = MatrixSimilarity(lsimodel[corpus])

# Process one query per line: project it into LSI space, score it against every
# indexed document and print (document position, similarity) pairs, best first.
with open(QUERIES_PATH) as queries_file:
    for line in queries_file:
        new_vec = dictionary.doc2bow(tokenizer.tokenize(line))
        # NOTE: despite its name, this is the query's LSI (not LDA) projection.
        doc_lda = lsimodel[new_vec]
        sims = index[doc_lda]
        # Negating the score sorts in descending order of similarity.
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        print(sims)